import re
from bs4 import BeautifulSoup
from book import Book


class AiShangBa_Org(Book):
    '''Novel crawler for the "Xin Biquge" site (aishangba.org).

    Every method takes text that has already been downloaded and extracts
    one piece of information from it.
    NOTE(review): presumably these are hooks called by ``Book``; the base
    class is not visible from this file -- confirm there.
    '''

    # Site notice embedded in chapter text.  The quantifier is non-greedy so
    # that, if the notice occurs several times, each occurrence is removed on
    # its own instead of everything between the first and the last one.  The
    # dots of the URL are escaped so they only match literal dots.
    _AD_NOTICE_RE = re.compile(
        r'由于各种问题.*?免费阅读。https://www\.aishangba\.org', re.S)

    def _get_title(self, html):
        '''Return the text of the first ``<h1>`` element in *html*.

        Raises:
            AttributeError: if the page contains no ``<h1>`` element.
        '''
        soup = BeautifulSoup(html, 'html.parser')
        title = soup.find('h1').text
        return title

    def _get_author(self, html):
        '''Return the author name found in the ``<div id="info">`` block.

        The name is whatever follows the first ``者：`` in the first ``<p>``
        of that block.

        Raises:
            AttributeError: if there is no ``<div id="info">``.
            IndexError: if the block has no ``<p>`` or the ``者：`` marker
                is missing.
        '''
        soup = BeautifulSoup(html, 'html.parser')
        author = soup.find('div', id='info').find_all('p')[
            0].text.split('者：')[1]
        return author

    def _get_article_urls(self, html):
        '''Map normalised chapter titles to chapter URLs.

        Every ``<a>`` inside ``<div id="list">`` is considered.  A link text
        such as ``第12章 name`` becomes ``第0012章 name``: the number found
        before the first ``章`` is zero-padded to four digits and the rest of
        the text (which may itself contain ``章``) is kept in full.  Link
        texts that do not follow this pattern are kept unchanged instead of
        aborting the whole parse.  ASCII ``?`` and ``:`` are replaced by
        their full-width forms in every title.

        Anchors without an ``href`` or without any text are skipped.  If two
        anchors produce the same title, the later one wins.

        Returns:
            dict: title -> ``self._domain + href``, in document order.

        Raises:
            AttributeError: if there is no ``<div id="list">``.
        '''
        soup = BeautifulSoup(html, 'html.parser')
        urls = {}
        for anchor in soup.find('div', id='list').find_all('a'):
            href = anchor.get('href')
            raw_title = anchor.text.strip()
            if not href or not raw_title:
                # Nothing usable to link to or to name the chapter with.
                continue
            # Split on the FIRST '章' only, so names containing '章' survive.
            head, marker, name = raw_title.partition('章')
            number = re.search(r'\d+', head)
            if marker and number:
                title = f'第{number.group().zfill(4)}章 {name.strip()}'
            else:
                # Unnumbered entry (e.g. prologue): keep the text as it is.
                title = raw_title
            title = title.replace('?', '？').replace(':', '：')
            # NOTE(review): self._domain is not defined in this file;
            # presumably it is set by Book -- confirm.
            urls[title] = self._domain + href
        return urls

    def _remove_content_invalid_chars(self, content):
        '''Strip the site's notice and mobile-domain watermark from *content*.

        First removes every notice matched by ``_AD_NOTICE_RE``, then every
        remaining literal ``m.aishangba.org``.

        Returns:
            str: the cleaned text.
        '''
        content = self._AD_NOTICE_RE.sub('', content)
        content = content.replace('m.aishangba.org', '')
        return content
